{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Unit Test for Phishing Detection Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-10-16T22:23:22.484428Z",
     "iopub.status.busy": "2020-10-16T22:23:22.484097Z",
     "iopub.status.idle": "2020-10-16T22:23:23.007578Z",
     "shell.execute_reply": "2020-10-16T22:23:23.006978Z",
     "shell.execute_reply.started": "2020-10-16T22:23:22.484401Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "%load_ext spl2_kernel"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract the first 10 records from the test dataset as unit test data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-10-16T22:23:35.177805Z",
     "iopub.status.busy": "2020-10-16T22:23:35.177535Z",
     "iopub.status.idle": "2020-10-16T22:23:35.800940Z",
     "shell.execute_reply": "2020-10-16T22:23:35.800336Z",
     "shell.execute_reply.started": "2020-10-16T22:23:35.177783Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.7/site-packages/dateutil/parser/_parser.py:1218: UnknownTimezoneWarning: tzname BST identified but not understood.  Pass `tzinfos` argument in order to correctly return a timezone-aware datetime.  In a future version, this will raise an exception.\n",
      "  category=UnknownTimezoneWarning)\n",
      "/opt/conda/lib/python3.7/site-packages/dateutil/parser/_parser.py:1218: UnknownTimezoneWarning: tzname EDT identified but not understood.  Pass `tzinfos` argument in order to correctly return a timezone-aware datetime.  In a future version, this will raise an exception.\n",
      "  category=UnknownTimezoneWarning)\n",
      "/opt/conda/lib/python3.7/site-packages/dateutil/parser/_parser.py:1218: UnknownTimezoneWarning: tzname EST identified but not understood.  Pass `tzinfos` argument in order to correctly return a timezone-aware datetime.  In a future version, this will raise an exception.\n",
      "  category=UnknownTimezoneWarning)\n"
     ]
    }
   ],
   "source": [
    "df = pd.read_json('s3://smle-experiments/datasets/phishing_email/splunk_test.json', lines=True)[0:10]\n",
    "t = [i for i in range(10)]\n",
    "df['_time'] = t\n",
    "df.to_json('./detect_phishing_content.json', orient='records', lines=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SPL2 query that performs model inference for phishing detection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-10-16T22:26:41.231248Z",
     "iopub.status.busy": "2020-10-16T22:26:41.230973Z",
     "iopub.status.idle": "2020-10-16T22:26:56.882252Z",
     "shell.execute_reply": "2020-10-16T22:26:56.881749Z",
     "shell.execute_reply.started": "2020-10-16T22:26:41.231226Z"
    }
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "39a07cbd58e74860aa67307b52b6df3c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " Finished.                     "
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>start_time</th>\n",
       "      <th>isPhishing</th>\n",
       "      <th>entities</th>\n",
       "      <th>Content</th>\n",
       "      <th>probability</th>\n",
       "      <th>end_time</th>\n",
       "      <th>From</th>\n",
       "      <th>body</th>\n",
       "      <th>Subject</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4</td>\n",
       "      <td>True</td>\n",
       "      <td>TBD</td>\n",
       "      <td>Dear friend.I know that this letter may come t...</td>\n",
       "      <td>0.999971</td>\n",
       "      <td>4</td>\n",
       "      <td>henry kabore  &lt;henry_kabore_10@hotmail.fr&gt;</td>\n",
       "      <td>TBD</td>\n",
       "      <td>This is from Mr Henry Kabore</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   start_time isPhishing entities  \\\n",
       "0           4       True      TBD   \n",
       "\n",
       "                                             Content  probability  end_time  \\\n",
       "0  Dear friend.I know that this letter may come t...     0.999971         4   \n",
       "\n",
       "                                          From body  \\\n",
       "0   henry kabore  <henry_kabore_10@hotmail.fr>  TBD   \n",
       "\n",
       "                           Subject  \n",
       "0   This is from Mr Henry Kabore    "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<spl2_kernel.spl2_runner.SPL2Job at 0x7fc4f7e7c4d0>"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%spl2\n",
    "/* Load the unit-test records from the JSON file on S3. */\n",
    "| from read_json(\"s3://smle-experiments/datasets/phishing_email/detect_phishing_content.json\")\n",
    "/* Join From, Subject and Content with single spaces, then append a long run of spaces. */\n",
    "/* Presumably the padding guarantees that every character position read below exists - TODO confirm. */\n",
    "| eval eventLine=concat(From, \" \", Subject, \" \", Content, \" \", \"                                                                                                                                \")\n",
    "/* Lookup table from printable characters to integer codes 32-126 (the values match ASCII). */\n",
    "/* NOTE(review): the doubled dollar key looks like an escaped single dollar sign - confirm against the kernel templating rules. */\n",
    "| eval mapC = {\" \":32,\"!\":33,\"\\\"\":34,\"#\":35,\"$$\":36,\"%\":37,\"&\":38,\"'\":39,\"(\":40,\")\":41,\"*\":42,\"+\":43,\",\":44,\"-\":45,\".\":46,\"/\":47,\"0\":48,\"1\":49,\"2\":50,\"3\":51,\"4\":52,\"5\":53,\"6\":54,\"7\":55,\"8\":56,\"9\":57,\":\":58,\";\":59,\"<\":60,\"=\":61,\">\":62,\"?\":63,\"@\":64,\"A\":65,\"B\":66,\"C\":67,\"D\":68,\"E\":69,\"F\":70,\"G\":71,\"H\":72,\"I\":73,\"J\":74,\"K\":75,\"L\":76,\"M\":77,\"N\":78,\"O\":79,\"P\":80,\"Q\":81,\"R\":82,\"S\":83,\"T\":84,\"U\":85,\"V\":86,\"W\":87,\"X\":88,\"Y\":89,\"Z\":90,\"[\":91,\"\\\\\":92,\"]\":93,\"^\":94,\"_\":95,\"`\":96,\"a\":97,\"b\":98,\"c\":99,\"d\":100,\"e\":101,\"f\":102,\"g\":103,\"h\":104,\"i\":105,\"j\":106,\"k\":107,\"l\":108,\"m\":109,\"n\":110,\"o\":111,\"p\":112,\"q\":113,\"r\":114,\"s\":115,\"t\":116,\"u\":117,\"v\":118,\"w\":119,\"x\":120,\"y\":121,\"z\":122,\"{\":123,\"|\":124,\"}\":125,\"~\":126}\n",
    "/* For each position i produced by mvrange(1,129), take the one-character substring of eventLine, */\n",
    "/* look it up in mapC and cast the code to float. The result is stored in the field embedding_input:0, */\n",
    "/* presumably the name of the model input - TODO confirm. */\n",
    "| eval 'embedding_input:0' = for_each(\n",
    "        iterator(mvrange(1,129), \"i\"),\n",
    "        cast(map_get(mapC, substr(eventLine, i, 1)), \"float\") )\n",
    "/* Apply the phishing_email_v7 model located under the given S3 path. */\n",
    "| apply_model connection_id=\"\" path=\"s3://smle-experiments/models/xlin/phishing_email\" name=\"phishing_email_v7\" \n",
    "/* Use element 0 of the dense/Sigmoid:0 field as the probability and keep only records above 0.5. */\n",
    "| eval probability = mvindex('dense/Sigmoid:0', 0) \n",
    "| where probability > 0.5\n",
    "/* Shape the output: both timestamps copy _time, while entities and body are TBD placeholders. */\n",
    "/* NOTE(review): isPhishing is selected but never assigned in this query, so it presumably comes */\n",
    "/* from the input records or from the model output - confirm. */\n",
    "| eval start_time = _time, end_time = _time, entities = \"TBD\", body = \"TBD\"\n",
    "| select start_time, end_time, From, entities, body, Subject, Content, isPhishing, probability\n",
    ";"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
